***** Clean quote data


clear 
clear matrix
set memory 1000m
set more off
cap log close



cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results"



log using log_file/clean_042519.log, replace




** clean the quotes data
use raw_data/quote_order_all.dta, clear


tab ic_code, m

gen ic_code_num=1 if ic_code=="ASTP"
replace ic_code_num=2 if ic_code=="BHCX"
replace ic_code_num=3 if ic_code=="CIC"
replace ic_code_num=4 if ic_code=="ab"
replace ic_code_num=5 if ic_code=="anxin"
replace ic_code_num=6 if ic_code=="axatp"
replace ic_code_num=7 if ic_code=="baobiao"
replace ic_code_num=8 if ic_code=="bbcx"
replace ic_code_num=9 if ic_code=="chinalife"
replace ic_code_num=10 if ic_code=="cpic"
replace ic_code_num=11 if ic_code=="dadi"
replace ic_code_num=12 if ic_code=="funde"
replace ic_code_num=13 if ic_code=="libao"
replace ic_code_num=14 if ic_code=="picc"
replace ic_code_num=15 if ic_code=="pinan" | ic_code=="pingan" 
replace ic_code_num=16 if ic_code=="samsumg" |  ic_code=="samsung"
replace ic_code_num=17 if ic_code=="sinosafe"
replace ic_code_num=18 if ic_code=="sinosig"
replace ic_code_num=19 if ic_code=="taiping"
replace ic_code_num=20 if ic_code=="tianan"
replace ic_code_num=21 if ic_code=="yongcheng"

tab ic_code_num, m



order user_id insurance_start_date vin
sort user_id insurance_start_date


bysort user_id: gen user_id_temp=_n
count if user_id_temp==1
count if price_uuid~=""
* 53,808 individuals who had at least one quote
* 4401 purchase


* first see how many cars each consumer may have quoted
bysort user_id vin: gen temp=_n
replace temp=0 if temp>1

bysort user_id: egen total_car=total(temp)

tab total_car if temp==1 // one individual may quote for different cars
drop if total_car>1
drop temp

drop user_id_temp
bysort user_id: gen user_id_temp=_n
count if user_id_temp==1
count if price_uuid~=""
* after this step, has 50,786 individuals
* 3025 purchase

sort user_id insurance_start_date

count if total_fee>0 & total_fee~=. & jiaoq_fee==.
*list total_fee jiaoq_fee if total_fee>0 & total_fee~=. & jiaoq_fee==.


count if total_fee==0
count if jiaoq_fee==.

drop if total_fee==0
drop if jiaoq_fee==.  
// it's possible to have jiaoq_fee=0 if the person only bought commercial insurance
// but it does not make sense to have jiaoq_fee=.


drop user_id_temp
bysort user_id: gen user_id_temp=_n
count if user_id_temp==1
count if price_uuid~=""
* after this step, has 42,863 individuals
* 2975 purchase

count if insurance_start_date==.
// does insurance start date matter? 
// when getting quote, may not specify the starting date of the plan

gen quote_created_date=substr(quote_created_time, 1, 10)

order user_id quote_created_date vin


** there are some user_id use the same vin number??

sort vin user_id

bysort vin user_id: gen temp_check=_n
replace temp_check=0 if temp_check>1

bysort vin: egen temp_check_total=total(temp_check)

tab temp_check_total if temp_check_total~=1
tab temp_check_total if vin~="" & temp_check_total~=1
tab temp_check_total if price_uuid~=""

*tab total_fee if temp_check_total==833
*list if temp_check_total>1 & identity~=""


drop if vin~="" & temp_check_total>7

drop temp_check temp_check_total

drop user_id_temp
bysort user_id: gen user_id_temp=_n
count if user_id_temp==1
count if price_uuid~=""
* after this step, only has 30,629 individuals
* 2907 purchase


** some user buy insurance many times for the same car?


gen temp=1 if price_uuid~=""
bysort user_id quote_created_date vin: egen temp1=total(temp)

tab temp1

drop if temp1>1

drop user_id_temp
bysort user_id: gen user_id_temp=_n
count if user_id_temp==1
count if price_uuid~=""
* after this step, only has 30,538 individuals
* 1624 purchase


drop user_id_temp temp temp1

****************** create averages

sort user_id quote_created_date

* aggregate all other additional insurance products
egen total_other_fee=rowtotal(sy_boli sy_sheshui sy_sheshui_bj sy_siji sy_siji_bj sy_chengke sy_chengke_bj sy_huahen sy_huahen_bj sy_ziran sy_ziran_bj ///
				     sy_zhuanxiu sy_hc_jingshensunshi_bj sy_hc_jingshensunshi sy_hc_xiulichang sy_hc_sanfangteyue sy_hc_feiyongbuchang sy_hc_huowuzeren ///
					 sy_hc_shebeisunshi sy_chedeng sy_renyuan_bj)

replace premium_fee=0 if premium_fee==.	


gen jiaoq_amt=jiaoq_fee
bysort user_id quote_created_date: egen temp_jiaoq_amt=max(jiaoq_amt)	
replace jiaoq_amt=temp_jiaoq_amt if jiaoq_amt==0
drop temp_jiaoq_amt

bysort user_id: egen temp_jiaoq_amt=max(jiaoq_amt)
replace jiaoq_amt=temp_jiaoq_amt if jiaoq_amt==0 
drop temp_jiaoq_amt		

tab jiaoq_amt	 
					 
* average over user_id and quote date 					 
bysort user_id quote_created_date: egen avg_total_fee=mean(total_fee)
bysort user_id quote_created_date: egen avg_jiaoq=mean(jiaoq_fee)           // mandatory -- Compulsory insurance for vehicle traffic accident liability
bysort user_id quote_created_date: egen avg_premium=mean(premium_fee)       // voluntary -- additional coverage
bysort user_id quote_created_date: egen avg_chesun=mean(sy_chesun) 		    // similar to collision insurance, protect the driver
bysort user_id quote_created_date: egen avg_chesun_bj=mean(sy_chesun_bj)    // no deductible
bysort user_id quote_created_date: egen avg_sanzhe=mean(sy_sanzhe)    	    // third-party liability insurance (compliment to jiaoq)
bysort user_id quote_created_date: egen avg_sanzhe_bj=mean(sy_sanzhe_bj)    // no deductible
bysort user_id quote_created_date: egen avg_daoq=mean(sy_daoq) 			    // deal with theft or vandalism, similar to comprehensive insurance
bysort user_id quote_created_date: egen avg_daoq_bj=mean(sy_daoq_bj)	    // no deductible
bysort user_id quote_created_date: egen avg_other_fee=mean(total_other_fee) // all other insurance products


* gen dummy variables for different insurance products
gen jiaoq_dum=1 if jiaoq_fee~=0
replace jiaoq_dum=0 if jiaoq_fee==0

count if premium_fee==.
gen sy_dum=1 if premium_fee~=0 
replace sy_dum=0 if sy_dum==.

count if sy_chesun==.
gen chesun_dum=1 if sy_chesun~=0
replace chesun_dum=0 if chesun_dum==.

count if sy_chesun_bj==.
gen chesun_bj_dum=1 if sy_chesun_bj~=0
replace chesun_bj_dum=0 if chesun_bj_dum==.

count if sy_sanzhe==.
gen sanzhe_dum=1 if sy_sanzhe~=0
replace sanzhe_dum=0 if sanzhe_dum==.

count if sy_sanzhe_bj==.
gen sanzhe_bj_dum=1 if sy_sanzhe_bj~=0
replace sanzhe_bj_dum=0 if sanzhe_bj_dum==.

count if sy_daoq==.
gen daoq_dum=1 if sy_daoq~=0
replace daoq_dum=0 if daoq_dum==.

count if sy_daoq_bj==.
gen daoq_bj_dum=1 if sy_daoq_bj~=0
replace daoq_bj_dum=0 if daoq_bj_dum==.

count if total_other_fee==.
gen other_ins=1 if total_other_fee~=0
replace other_ins=0 if other_ins==.


* average over user_id and quote date 					 
bysort user_id quote_created_date: egen avg_jiaoq_dum=mean(jiaoq_dum)       
bysort user_id quote_created_date: egen avg_sy_dum=mean(sy_dum)   
bysort user_id quote_created_date: egen avg_chesun_dum=mean(chesun_dum) 		
bysort user_id quote_created_date: egen avg_chesun_bj_dum=mean(chesun_bj_dum) 
bysort user_id quote_created_date: egen avg_sanzhe_dum=mean(sanzhe_dum)    
bysort user_id quote_created_date: egen avg_sanzhe_bj_dum=mean(sanzhe_bj_dum)
bysort user_id quote_created_date: egen avg_daoq_dum=mean(daoq_dum) 			
bysort user_id quote_created_date: egen avg_daoq_bj_dum=mean(daoq_bj_dum)
bysort user_id quote_created_date: egen avg_other_ins=mean(other_ins)



******************* Now create purchase data


gen temp=1 if price_uuid~=""
bysort user_id quote_created_date: egen order_ins=total(temp)

count if temp==1 // 1624 purchases remain
tab order_ins 

gen temp_ic=ic_code_num if temp==1
gen temp_total_fee=total_fee if temp==1
gen temp_jiaoq_fee=jiaoq_fee if temp==1
gen temp_premium_fee=premium_fee if temp==1
gen temp_chesun=sy_chesun if temp==1
gen temp_chesun_bj=sy_chesun_bj if temp==1
gen temp_sanzhe=sy_sanzhe if temp==1
gen temp_sanzhe_bj=sy_sanzhe_bj if temp==1
gen temp_daoq=sy_daoq if temp==1
gen temp_daoq_bj=sy_daoq_bj if temp==1
gen temp_other_fee=total_other_fee if temp==1

bysort user_id quote_created_date: egen order_ic=total(temp_ic) if order_ins==1
drop temp_ic

bysort user_id quote_created_date: egen order_total_fee=total(temp_total_fee) if order_ins==1
drop temp_total_fee

bysort user_id quote_created_date: egen order_jiaoq_fee=total(temp_jiaoq_fee) if order_ins==1
drop temp_jiaoq_fee

bysort user_id quote_created_date: egen order_premium_fee=total(temp_premium_fee) if order_ins==1
drop temp_premium_fee

bysort user_id quote_created_date: egen order_chesun=total(temp_chesun) if order_ins==1
drop temp_chesun

bysort user_id quote_created_date: egen order_chesun_bj=total(temp_chesun_bj) if order_ins==1
drop temp_chesun_bj

bysort user_id quote_created_date: egen order_sanzhe=total(temp_sanzhe) if order_ins==1
drop temp_sanzhe

bysort user_id quote_created_date: egen order_sanzhe_bj=total(temp_sanzhe_bj) if order_ins==1
drop temp_sanzhe_bj


bysort user_id quote_created_date: egen order_daoq=total(temp_daoq) if order_ins==1
drop temp_daoq

bysort user_id quote_created_date: egen order_daoq_bj=total(temp_daoq_bj) if order_ins==1
drop temp_daoq_bj

bysort user_id quote_created_date: egen order_other_fee=total(temp_other_fee) if order_ins==1
drop temp_other_fee


** generate other information for an order


sort user_id quote_created_date temp

gen temp_new=order_placed_time
replace temp_new=temp_new[_n-1] if user_id==user_id[_n-1] & quote_created_date==quote_created_date[_n-1]

gen order_placed_time_new=clock(temp_new, "YMDhms")
format order_placed_time_new %tc

gen order_placed_date=date(temp_new, "YMDhms")
format order_placed_date %td

drop order_placed_time temp_new
rename order_placed_time_new order_placed_time

gen temp_new=identity
sort user_id temp
replace temp_new=temp_new[_n-1] if user_id==user_id[_n-1] 
rename temp_new user_id_num


gen temp_new=address
sort user_id temp
replace temp_new=temp_new[_n-1] if user_id==user_id[_n-1] 
rename temp_new user_address


*tab price, m
*list if price<0

bysort user_id quote_created_date: egen ins_price=total(price) 
replace ins_price=. if order_ins==0

*tab ins_price
*list if ins_price<0  // get discount greater than the premium
*list if ins_price==0 // get discount equal to the premium

bysort user_id quote_created_date: egen ins_save_price=total(save_price)
replace ins_save_price=. if order_ins==0


sort user_id quote_created_date temp
drop temp


save working_data/quote_all_clean_042719.dta, replace


bysort user_id quote_created_date: keep if _n==1
gen calendar_day=quote_created_date

save working_data/quote_day_level_clean_042719.dta, replace

use working_data/quote_day_level_clean_042719.dta, clear

bysort user_id: gen temp=_n
count if temp==1
// 30538 individual drivers that have quote info, consistent
count if order_ins==1
// 1624 purchase, consistent

*keep if temp==1
*keep user_id




/*
bysort user_id insurance_start_date: egen avg_total_fee=mean(total_fee)
bysort user_id insurance_start_date: egen avg_jiaoq=mean(jiaoq_fee)
bysort user_id insurance_start_date: egen avg_premium=mean(premium_fee)
bysort user_id insurance_start_date: egen avg_chesun=mean(sy_chesun)
bysort user_id insurance_start_date: egen avg_chesun_bj=mean(sy_chesun_bj)
bysort user_id insurance_start_date: egen avg_sanzhe=mean(sy_sanzhe)
bysort user_id insurance_start_date: egen avg_sanzhe_bj=mean(sy_sanzhe_bj)
bysort user_id insurance_start_date: egen avg_daoq=mean(sy_daoq)
bysort user_id insurance_start_date: egen avg_daoq_bj=mean(sy_daoq_bj)



bysort user_id insurance_start_date: drop if _n~=1
drop if insurance_start_date==.


keep user_id insurance_start_date vin avg_total_fee avg_jiaoq avg_premium avg_chesun avg_chesun_bj avg_sanzhe avg_sanzhe_bj avg_daoq avg_daoq_bj


gen insurance_end_date=insurance_start_date[_n+1] if user_id==user_id[_n+1]
replace insurance_end_date=insurance_start_date+365 if user_id~=user_id[_n+1]

format insurance_start_date %tg
format insurance_end_date %tg



save working_data/quote_given_period.dta, replace

use working_data/quote_given_period.dta, clear
keep user_id
bysort user_id: keep if _n==1
save working_data/quote_user_id.dta, replace


use working_data/clean_temp_1203.dta, clear

keep user_id trip_created_date
bysort user_id trip_created_date: keep if _n==1

gen temp=date(trip_created_date, "YMD")
drop trip_created_date
rename temp trip_created_date

merge m:1 user_id using working_data/quote_user_id.dta
keep if _merge==3
